#!/bin/bash
# Time: 2022-12-28 22:03:57
# Desc: 压力测试控制脚本

# Default settings for a stress run; all of them are exported.
#   TST_STRESS_DIR               per-run working directory (timestamped)
#   TST_STRESS_NR_STORE          how many stress-* log directories to retain
#   TST_STRESS_MAX_LOG           per-script log index wraps to 0 at this value
#   TST_STRESS_CONFIG_INTERVAL   seconds between config re-reads
#   TST_STRESS_RUNNING_INTERVAL  seconds between main-loop iterations
#   TST_STRESS_LOOP_INTERVAL     seconds between two runs of one script
#   TST_STRESS_MODE              "normal" or "daemon"
#   TST_STRESS_STOP_SIGNAL       signal number written as KillSignal in the unit
#
# Layout of TST_STRESS_DIR as used by the functions in this file:
#   flag.run             holds a path; the test keeps running while it exists
#   test.config          copy of the user-supplied config file
#   config/              numbered config snapshots and scratch files
#   running/             generated run-*.sh scripts and their .pid files
#   logs/                copies of the scripts and their per-iteration logs
#   result.txt           one summary line per script iteration
#   resource.detail.txt  periodic detailed resource dumps
#   run.txt              captured output of the main loop
TST_STRESS_DIR="$TST_TS_TOPDIR/logs/stress-$(date '+%Y%m%d-%H%M%S-%N')"
TST_STRESS_NR_STORE=100
TST_STRESS_MAX_LOG=1000
TST_STRESS_CONFIG_INTERVAL=60
TST_STRESS_RUNNING_INTERVAL=60
TST_STRESS_LOOP_INTERVAL=1
TST_STRESS_MODE=normal
TST_STRESS_STOP_SIGNAL=12
export TST_STRESS_DIR TST_STRESS_NR_STORE TST_STRESS_MAX_LOG
export TST_STRESS_CONFIG_INTERVAL TST_STRESS_RUNNING_INTERVAL TST_STRESS_LOOP_INTERVAL
export TST_STRESS_MODE TST_STRESS_STOP_SIGNAL

# Tell whether the stress test should keep running.
# $TST_STRESS_DIR/flag.run stores a path; the test is considered running for as
# long as that path exists. A missing or empty flag file means "stopped".
# Returns: 0 -- keep running, non-zero -- stop
is_stress_running() {
    local flag_target

    flag_target="$(cat "$TST_STRESS_DIR/flag.run" 2>/dev/null)"
    [ -e "$flag_target" ]
}

# Snapshot test.config and (re)generate one runnable script per command line.
#
# Reads:   $TST_STRESS_DIR/test.config (a missing file is treated as a config
#          that contains only a comment, i.e. no commands)
# Writes:  $TST_STRESS_DIR/config/<N>.conf            snapshot of each new config
#          $TST_STRESS_DIR/config/last.md5            checksum of the applied config
#          $TST_STRESS_DIR/running/run-<md5>-<k>.sh   one script per command line
# Returns: 0 -- the config changed and the running directory was refreshed
#          1 -- the config is identical to the one applied last time
process_config() {
    local config_dir="$TST_STRESS_DIR/config"
    local running_dir="$TST_STRESS_DIR/running"
    local config_index=0
    local cmd_index=1
    local md5_last=""
    local md5_now
    local cmd_line
    local script_name
    local s

    # Pick the first unused snapshot number so every earlier config is kept.
    while :; do
        config_index=$((config_index + 1))
        [ -e "$config_dir/${config_index}.conf" ] || break
    done

    if [ -f "$TST_STRESS_DIR/test.config" ]; then
        cp "$TST_STRESS_DIR/test.config" "$config_dir/tmp.conf"
        # The source file may lack a trailing newline; append one to be safe.
        echo >>"$config_dir/tmp.conf"
    else
        echo "# config not exist" >"$config_dir/tmp.conf"
    fi

    md5sum "$config_dir/tmp.conf" | awk '{print $1}' >"$config_dir/now.md5"
    # Same checksum as the config applied last time: nothing to do.
    diff "$config_dir/last.md5" "$config_dir/now.md5" >/dev/null 2>&1 && return 1
    cp "$config_dir/tmp.conf" "$config_dir/${config_index}.conf"

    # Trim every line, drop comments and blank lines, and sort so that
    # identical commands end up adjacent.
    sed -e 's|^[[:blank:]]*||' -e 's|[[:blank:]]*$||' -e '/^#/d' -e '/^$/d' \
        "$config_dir/tmp.conf" | sort >"$config_dir/clean.save.conf"

    # One script per command line. The name embeds the md5 of the script; a
    # command listed k times yields suffixes -1 .. -k, so it runs k times in
    # parallel.
    while IFS= read -r cmd_line; do
        printf '#!/bin/bash\n%s\n' "$cmd_line" >"$config_dir/tmp.sh"
        md5_now=$(md5sum "$config_dir/tmp.sh" | awk '{print $1}')
        if [ "$md5_last" == "$md5_now" ]; then
            cmd_index=$((cmd_index + 1))
        else
            cmd_index=1
        fi
        mv "$config_dir/tmp.sh" "$config_dir/run-${md5_now}-${cmd_index}.sh"
        md5_last="$md5_now"
    done <"$config_dir/clean.save.conf"

    # Sync the running directory: a script that is still wanted stays where it
    # is (the fresh duplicate is dropped); a script no longer in the config is
    # removed.
    for s in "$running_dir"/*.sh; do
        [ -e "$s" ] || continue
        script_name="$(basename "$s")"
        if [ -f "$config_dir/$script_name" ]; then
            rm "$config_dir/$script_name"
        else
            rm "$s"
        fi
    done
    # Whatever is left in config/ is new: hand it to the running directory.
    for s in "$config_dir"/run-*.sh; do
        [ -e "$s" ] || continue
        mv "$s" "$running_dir"
    done
    rm -f "$config_dir"/*.sh 2>/dev/null

    # Remember what has been applied so the next unchanged poll returns 1.
    cp "$config_dir/now.md5" "$config_dir/last.md5"
    return 0
}

# Poll the config file for as long as the stress test is running: wait
# TST_STRESS_CONFIG_INTERVAL seconds, then let process_config apply any change.
update_config() {
    until ! is_stress_running; do
        sleep $TST_STRESS_CONFIG_INTERVAL
        process_config
    done
}

# Run one stress script over and over until the test stops or the script
# file disappears.
# $1 -- path of the script to run
# Each iteration writes its output to
# $TST_STRESS_DIR/logs/<script>-<index>.txt and appends a summary line to
# $TST_STRESS_DIR/result.txt. The log index wraps after TST_STRESS_MAX_LOG
# iterations, so old logs are overwritten.
# Returns: 1 -- the script does not exist
#          0 -- another loop already owns the script, or the loop finished
stress_loop() {
    local script="$1"
    local pid_file="${script}.pid"
    local loop_count=0
    local log_index=0
    local log_file
    local ret_code
    local script_name
    local time_start
    local time_cost

    script_name="$(basename "$script")"
    mkdir -p "$TST_STRESS_DIR/logs"
    # Nothing to run.
    [ -f "$script" ] || return 1
    # A live process already loops over this script.
    if [ -f "$pid_file" ] && [ -d "/proc/$(cat "$pid_file")" ]; then
        return 0
    fi
    cp "$script" "$TST_STRESS_DIR/logs/"
    rm -rf "$pid_file"
    # This function is normally started with '&', i.e. inside a subshell, where
    # $$ still names the parent shell. BASHPID is the PID of this very loop, so
    # a loop that died without cleaning up can be detected and restarted.
    echo "$BASHPID" >"$pid_file"
    while is_stress_running && [ -f "$script" ]; do
        log_index=$((log_index + 1))
        loop_count=$((loop_count + 1))
        log_file="$TST_STRESS_DIR/logs/${script_name}-${log_index}.txt"
        time_start=$(get_up_time_sec)
        # NOTE(review): scripts generated by process_config carry a #!/bin/bash
        # shebang but are executed with /bin/sh here -- confirm this is intended.
        /bin/sh "$script" >"$log_file" 2>&1
        ret_code=$?
        time_cost=$(($(get_up_time_sec) - time_start))
        echo "$(date '+%Y%m%d-%H%M%S.%N') $script_name loop $loop_count cost $time_cost return $ret_code" |
            tee -a "$TST_STRESS_DIR/result.txt" |
            tee -a "$log_file"
        # Wrap the log index so at most TST_STRESS_MAX_LOG log files are kept.
        [ "$log_index" == "$TST_STRESS_MAX_LOG" ] && log_index=0
        sleep "$TST_STRESS_LOOP_INTERVAL"
    done
    rm -rf "$pid_file"
    return 0
}

# Print a three-line resource summary (cpu, memory, open files), used to spot
# resource leaks while the stress test runs. Every line starts with "# " and a
# timestamp.
# $1 -- sample counter (accepted for the caller's convenience, not used here)
get_resource_summary() {
    local stamp_fmt='+%Y%m%d-%H%M%S.%N'
    local stamp
    local mem_fields

    # Aggregate CPU counters: first line of /proc/stat.
    stamp="$(date "$stamp_fmt")"
    printf '# %s cpu %s\n' "$stamp" "$(head -n 1 /proc/stat)"

    # Selected /proc/meminfo fields, joined as "Name:value " on one line.
    stamp="$(date "$stamp_fmt")"
    mem_fields="$(awk '/MemFree:|MemAvailable:|Buffers:|Cached:/ { printf "%s%s ", $1, $2 }' /proc/meminfo)"
    printf '# %s memory %s\n' "$stamp" "$mem_fields"

    # File handle counters from /proc/sys/fs/file-nr.
    stamp="$(date "$stamp_fmt")"
    printf '# %s file %s\n' "$stamp" "$(cat /proc/sys/fs/file-nr)"
}

# Create the helper script logs/stress-current/get-resource-detail, which dumps
# detailed system resource information (various /proc files, ipcs, df, ps).
# Does nothing when an executable helper is already in place.
gen_cmd_resource_detail() {
    local detail_cmd="$TST_TS_TOPDIR/logs/stress-current/get-resource-detail"

    if [ -x "$detail_cmd" ]; then
        return
    fi
    cat >"$detail_cmd" <<'EOF'
#!/bin/sh
set -x
date '+%Y%m%d-%H%M%S.%N'
cat /proc/uptime
cat /proc/stat
cat /proc/meminfo
cat /proc/buddyinfo
cat /proc/slabinfo
cat /proc/sys/fs/file-nr
cat /proc/vmstat
cat /proc/vmallocinfo
cat /proc/interrupts
cat /proc/zoneinfo
cat /proc/timer_list
cat /proc/softirqs
cat /proc/sys/fs/inode-state
cat /proc/sys/fs/dentry-state
cat /proc/sys/fs/nr_open
ipcs
df -i
df -k
ps axfww
EOF
    chmod +x "$detail_cmd"
}

# Print detailed system resource information, used to spot resource leaks
# while the stress test runs. The helper script is generated on first use;
# its stderr is merged into stdout.
get_resource_detail() {
    local detail_cmd="$TST_TS_TOPDIR/logs/stress-current/get-resource-detail"

    if [ ! -x "$detail_cmd" ]; then
        gen_cmd_resource_detail
    fi
    "$detail_cmd" 2>&1
}

# Generate the executable $TST_TS_TOPDIR/logs/tst-stress-monitor.
# When run, the generated script stops every tst-stress-XXXXXXXX.service that
# systemd lists, waits 10 seconds, dumps detailed resource information and
# starts the same services again. All of its output is appended to
# logs/stress-current/stress-monitor.txt.
gen_stress_monitor() {
    local monitor_cmd="$TST_TS_TOPDIR/logs/tst-stress-monitor"

    # The service list is captured once, before anything is stopped: a stopped
    # (inactive) unit is not shown by a plain "systemctl list-units", so asking
    # again after the stop would leave the services down.
    cat >"$monitor_cmd" <<-EOF
#!/bin/sh
main(){
    date '+%Y%m%d-%H%M%S-%N'
    services=\$(systemctl list-units | grep -wo "tst-stress-[a-z0-9]\{8\}\.service" | sort | uniq)
    for s in \$services; do
        systemctl --no-pager --full status "\$s"
        systemctl stop "\$s"
    done
    sleep 10
    "$TST_TS_TOPDIR/logs/stress-current/get-resource-detail" 2>&1
    for s in \$services; do
        systemctl start "\$s"
        systemctl --no-pager --full status "\$s"
    done
}
main >>"$TST_TS_TOPDIR/logs/stress-current/stress-monitor.txt" 2>&1
EOF
    chmod +x "$monitor_cmd"
}

# Install a cron job that runs the generated tst-stress-monitor script every
# minute. That script stops the stress services, collects resource information
# and starts the services again.
stress_monitor() {
    local cron_table=/var/spool/cron/root
    local monitor_cmd="$TST_TS_TOPDIR/logs/tst-stress-monitor"

    # Make sure both helper scripts exist before cron can call them.
    gen_cmd_resource_detail
    gen_stress_monitor
    # Drop any earlier monitor entry, then register exactly one.
    sed -i "/tst-stress-monitor/d" "$cron_table"
    printf '%s\n' "* * * * * $monitor_cmd" >>"$cron_table"
    crontab -l
}

# Run the stress test as a systemd service.
# $* -- arguments appended to "tsuite stress" in the unit's ExecStart line
# The service is named tst-stress-<id>, where <id> is the first 8 hex digits
# of the md5 of the resolved TST_TS_TOPDIR path. When the service is already
# running it is only enabled; otherwise the unit file is (re)written, the
# monitor cron job is installed and the service is started and enabled.
# Returns: the status of "systemctl enable" when the service already runs,
#          otherwise the status of the last echo.
stress_daemon() {
    local daemon_id
    local service_name
    local unit_file

    daemon_id="$(realpath "$TST_TS_TOPDIR" | md5sum | head -c 8)"
    service_name="tst-stress-${daemon_id}"
    unit_file="$TST_TS_TOPDIR/logs/${service_name}.service"

    echo "the tst-stress run in daemon mode"

    if systemctl --no-pager --full status "${service_name}.service" >/dev/null 2>&1; then
        echo "the daemon exist, enable it: ${service_name}.service"
        systemctl --no-pager --full cat "${service_name}.service"
        systemctl --no-pager --full status "${service_name}.service"
        systemctl enable "${service_name}.service"
        return $?
    elif systemctl --no-pager --full list-units --all | grep -w "${service_name}.service"; then
        # --all is needed: inactive units are hidden from list-units by default.
        echo "the ${service_name}.service load, but not running, try rebuild"
    elif systemctl --no-pager --full list-unit-files | grep -w "${service_name}.service"; then
        # The unit file is installed but systemd has not loaded it.
        echo "the ${service_name}.service exist, but not load, try rebuild"
    else
        echo "try build ${service_name}.service"
    fi

    # Keep a copy of the unit under logs/ and install it system-wide.
    cat >"$unit_file" <<EOF
[Unit]
Description=TencentOS Stress Test Tool
After=network.target

[Service]
Type=simple
ExecStartPre=$TST_TS_TOPDIR/tsuite setup
ExecStart=$TST_TS_TOPDIR/tsuite stress $*
ExecStopPost=$TST_TS_TOPDIR/tsuite teardown
KillSignal=$TST_STRESS_STOP_SIGNAL
Restart=always
RestartSec=60s

[Install]
WantedBy=multi-user.target
EOF
    cp -v "$unit_file" /usr/lib/systemd/system/"${service_name}.service"

    stress_monitor
    systemctl daemon-reload
    systemctl start "${service_name}"
    systemctl enable "${service_name}"

    echo "the tst-stress running, you can do something like:"
    echo "get status: systemctl status ${service_name}.service"
    echo "pause test: systemctl stop ${service_name}.service"
    echo "stop test: systemctl disable ${service_name}.service"
}

# Handler run when the test is interrupted with Ctrl+C.
# Removes the run flag and then sends the stop signal to every process in the
# current process group (kill with pid 0).
handler_signal() {
    echo "stress test get signal"
    rm -rfv "$TST_STRESS_DIR/flag.run"
    # Use the configured stop signal -- the same one written as KillSignal into
    # the systemd unit -- and fall back to 12 when it is not set.
    kill -"${TST_STRESS_STOP_SIGNAL:-12}" 0
}

# Main loop of the stress test.
# $1 -- stress test config file; it is copied to $TST_STRESS_DIR/test.config
# Writes "/proc/<pid of this shell>" into flag.run, generates the run scripts,
# starts the background config poller and then, every
# TST_STRESS_RUNNING_INTERVAL seconds, prints a resource summary and makes sure
# each script in running/ has a loop attached.
# Returns: 0
stress_run() {
    local input_config="$1"
    local detail_interval=600
    local uptime_count
    local main_loop_count=0
    local s

    uptime_count=$(get_up_time_sec)
    mkdir -p "$TST_STRESS_DIR/running" "$TST_STRESS_DIR/config"
    cp "$input_config" "$TST_STRESS_DIR/test.config"
    echo "/proc/$$" >"$TST_STRESS_DIR/flag.run"

    process_config
    update_config &

    get_resource_detail >"$TST_STRESS_DIR/resource.detail.txt"
    get_resource_detail >"$TST_TS_TOPDIR/logs/stress-current/stress-monitor.txt"
    while is_stress_running; do
        main_loop_count=$((main_loop_count + 1))
        # A short resource summary on every pass.
        get_resource_summary "$main_loop_count"
        # A detailed dump once more than detail_interval seconds (600 s, i.e.
        # 10 minutes) have passed since the previous mark.
        if [ "$(get_up_time_sec)" -gt $((uptime_count + detail_interval)) ]; then
            uptime_count=$((uptime_count + detail_interval))
            get_resource_detail >>"$TST_STRESS_DIR/resource.detail.txt"
        fi
        # Start a loop for every script; stress_loop itself returns at once
        # when the script already has a live loop.
        for s in "$TST_STRESS_DIR"/running/*.sh; do
            # An unmatched glob stays literal: skip instead of forking.
            [ -e "$s" ] || continue
            stress_loop "$s" &
        done
        sleep "$TST_STRESS_RUNNING_INTERVAL"
    done
    return 0
}

# Stop the stress test for good: remove the monitor cron job, then stop,
# disable and delete every tst-stress-XXXXXXXX.service known to systemd.
stress_stop() {
    local cron_table=/var/spool/cron/root
    local services
    local s

    echo "remove crond task tst-stress-monitor"
    sed -i "/tst-stress-monitor/d" "$cron_table"
    crontab -l

    # A plain "list-units" hides inactive units, so a service that is currently
    # stopped (paused, or between restarts) would be missed and stay enabled.
    # Look at all loaded units and at the installed unit files.
    services="$({
        systemctl list-units --all
        systemctl list-unit-files
    } | grep -wo "tst-stress-[a-z0-9]\{8\}\.service" | sort -u)"

    # Word splitting is intended: one service name per word.
    for s in $services; do
        echo "stop systemd service: $s"
        systemctl --no-pager --full status "$s"
        systemctl stop "$s"
        systemctl disable "$s"
        systemctl unmask "$s"
        # Remove the unit file and anything else named after the service.
        find /usr/lib/systemd/system/ /etc/systemd/system/ -name "*${s}*" -print |
            grep -w "tst-stress" |
            xargs rm -rfv
        systemctl daemon-reload
        systemctl reset-failed
    done
}

# Entry point of the stress sub-command.
# Usage: stress_main [-d] -c <config> start
#        stress_main stop
#   -d           run as a systemd service (daemon mode)
#   -c <config>  stress test config file; may be given only once
#   start        begin testing; arguments after it are ignored
#   stop         remove the cron job and the systemd services, then return
# Returns: 1 on a usage error or a missing config file, otherwise 0
stress_main() {
    local input_config
    local daemon_args=""
    local log_dir
    local excess
    local i
    local -a old_logs=()

    trap handler_signal 2

    echo "stress_main $*"

    while [ $# -gt 0 ]; do
        case "$1" in
            "-d")
                export TST_STRESS_MODE=daemon
                shift
                ;;
            "-c")
                shift
                if [ -z "$input_config" ]; then
                    input_config="$1"
                    daemon_args="$daemon_args -c $1"
                else
                    echo "stress arg error: $1"
                    return 1
                fi
                shift
                ;;
            "start")
                echo "start stress testing: $*"
                daemon_args="$daemon_args $1"
                shift
                break
                ;;
            "stop")
                echo "stop stress testing: $*"
                stress_stop
                return 0
                ;;
            *)
                echo "unknown args: $*"
                return 1
                ;;

        esac
    done

    if [ ! -f "$input_config" ]; then
        # Report the config path that was parsed, not whatever is left in $1.
        echo "stress config file $input_config not exist"
        return 1
    fi

    # In daemon mode the systemd service does the actual work.
    if [ "$TST_STRESS_MODE" == "daemon" ]; then
        stress_daemon "$daemon_args"
        return 0
    fi

    # Keep the logs of TST_STRESS_NR_STORE runs, counting the one about to
    # start. Directory names embed the start time and pathname expansion is
    # sorted, so old_logs is ordered oldest first and the excess is removed
    # from the front.
    for log_dir in "$TST_TS_TOPDIR"/logs/stress-[0-9]*; do
        [ -e "$log_dir" ] && old_logs+=("$log_dir")
    done
    excess=$((${#old_logs[@]} - ${TST_STRESS_NR_STORE:-100} + 1))
    for ((i = 0; i < excess; i++)); do
        rm -rfv "${old_logs[$i]}"
    done

    mkdir -p "$TST_STRESS_DIR"
    # Point logs/stress-current at the directory of this run.
    rm -rf "$TST_TS_TOPDIR/logs/stress-current"
    ln -s "./$(basename "$TST_STRESS_DIR")" "$TST_TS_TOPDIR/logs/stress-current"
    stress_run "$input_config" 2>&1 | tee -a "$TST_STRESS_DIR/run.txt"
    return 0
}
